This is a Cox proportional hazards model fit to data from NHANES I, with follow-up mortality data from the NHANES I Epidemiologic Followup Study. It is designed to illustrate how SHAP values let us interpret XGBoost models in a setting where traditionally only linear models are used. We see interesting non-linear patterns in the data, which suggest the potential of this approach. Keep in mind that we have not yet checked the data for calibration against current lab tests, so you should treat this as a proof of concept rather than as solid medical insight.
Note that XGBoost only recently gained support for a Cox objective, so you will need a recent build of master.
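For the survival:cox objective, right-censoring is encoded in the sign of the label: a positive value is an observed time until death, while a negative value means the subject was still alive after |label| years of follow-up (which is why cells below work with np.abs(y)). A minimal toy sketch of this convention, with hypothetical values:
In [ ]:
import numpy as np
import xgboost

# died at 5 years, censored at 10 years, died at 2.5 years
toy_y = np.array([5.0, -10.0, 2.5])
toy_dmat = xgboost.DMatrix(np.random.randn(3, 4), label=toy_y)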
In [1]:
import pandas as pd
import matplotlib.pyplot as pl
from sklearn.model_selection import train_test_split
import xgboost
import numpy as np
import shap
import scipy as sp
import time
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
In [13]:
X,y = shap.datasets.nhanesi()
X_display,y_display = shap.datasets.nhanesi(display=True)
X = X.fillna(X.mean())
def sort_data(X, y):
    sinds = np.argsort(np.abs(y))
    return X.iloc[sinds,:], y[sinds]
# create a complete dataset
#X,y = sort_data(X, np.array(y))
xgb_full = xgboost.DMatrix(X.as_matrix(), label=y)
# create a train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=7)
#X_train,y_train = sort_data(X_train, y_train)
xgb_train = xgboost.DMatrix(X_train, label=y_train)
#X_test,y_test = sort_data(X_test, y_test)
xgb_test = xgboost.DMatrix(X_test, label=y_test)
In [117]:
# (scratch) switch to the diabetes regression dataset to test the explainers on a simple regression model
X,y = shap.datasets.diabetes()
xgb_full = xgboost.DMatrix(X.as_matrix(), label=y)
In [262]:
# train final model on the full data set
params = {
"eta": 0.05,
"max_depth": 3,
"objective": "reg:linear",
"subsample": 0.5
}
model = xgboost.train(params, xgb_full, 700, evals = [(xgb_full, "test")], verbose_eval=1000)
In [185]:
# train final model on the full data set
params = {
"eta": 0.05,
"max_depth": 1,
"objective": "reg:linear",
"subsample": 0.5
}
Xm = X.as_matrix()  # raw feature matrix, used throughout the cells below
cv_res = xgboost.cv(params, xgboost.DMatrix(Xm, y), 1000, verbose_eval=100)
model = xgboost.train(params, xgboost.DMatrix(Xm, y), 300, verbose_eval=100)
In [186]:
shap_values2 = shap.TreeExplainer(model).shap_values(Xm)
In [195]:
pl.plot(Xm[:,8], y, ".")
Out[195]:
In [192]:
shap.dependence_plot(8, shap_values2, X)
In [190]:
for i in range(X.shape[1]):
    shap.dependence_plot(i, shap_values2, X)
In [14]:
# use validation set to choose # of trees
# params = {
# "eta": 0.001,
# "max_depth": 3,
# "objective": "survival:cox",
# "subsample": 0.5
# }
# model_train = xgboost.train(params, xgb_train, 10000, evals = [(xgb_test, "test")], verbose_eval=1000)
In [15]:
# train final model on the full data set
params = {
"eta": 0.001,
"max_depth": 3,
"objective": "survival:cox",
"subsample": 0.5
}
model = xgboost.train(params, xgb_full, 7000, evals = [(xgb_full, "test")], verbose_eval=1000)
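As a quick sanity check of the fit we can compute Harrell's concordance statistic, the standard accuracy measure for Cox models. This helper is a simple O(n²) sketch (not part of the original cells) that assumes the signed-label convention described above; since this final model was trained on the full dataset the score is in-sample and optimistic:
In [ ]:
def c_statistic_harrell(pred, labels):
    """ Fraction of comparable pairs where the subject who died first had the higher predicted risk. """
    total, matches = 0, 0
    for i in range(len(labels)):
        for j in range(len(labels)):
            # the pair is comparable when j's death (labels[j] > 0) precedes i's last followup time
            if labels[j] > 0 and abs(labels[i]) > labels[j]:
                total += 1
                if pred[j] > pred[i]:
                    matches += 1
    return matches/total

c_statistic_harrell(model.predict(xgb_test, output_margin=True), np.array(y_test))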
In [122]:
start = time.time()
shap_values = model.predict(xgb_full, pred_contribs=True)
print(time.time() - start)
In [6]:
shap_values.shape
Out[6]:
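The contributions matrix returned by pred_contribs=True has shape (n_samples, n_features + 1): one column per feature plus a final column holding the constant expected value (bias) of the model output, which is why several cells below slice it with [:, :-1].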
In [16]:
def f(x):
    return model.predict(xgboost.DMatrix(x))
In [ ]:
?shap.force_plot
In [18]:
model.predict(xgboost.DMatrix(np.zeros((1,X.shape[1]))))
Out[18]:
In [19]:
explainer = shap.KernelExplainer(f, np.zeros((1,X.shape[1])))
In [ ]:
explainer = shap.KernelExplainer(f, X.median().as_matrix().reshape(1, -1))  # single median row as the background
In [31]:
inds = np.arange(X.shape[0])
np.random.shuffle(inds)
explainer = shap.KernelExplainer(f, X.as_matrix()[inds[:10],:])
In [33]:
6 * X.shape[0]
Out[33]:
In [42]:
j = 0
inds = np.arange(X.shape[1])
np.random.shuffle(inds)
np.where(inds == j)[0][0]
Out[42]:
In [44]:
[i for i in range(0, 10, 2)]
Out[44]:
In [67]:
def ime(j, f, x, X, nsamples=10):
    assert nsamples % 2 == 0, "nsamples must be divisible by 2!"
    X_masked = np.zeros((nsamples, X.shape[1]))
    inds = np.arange(X.shape[1])
    for i in range(0, nsamples//2):
        np.random.shuffle(inds)  # draw a fresh random feature ordering for each sample pair
        pos = np.where(inds == j)[0][0]
        rind = np.random.randint(X.shape[0])
        # row with feature j "on": only the features after j in the ordering are masked
        X_masked[i,:] = x
        X_masked[i,inds[pos+1:]] = X[rind,inds[pos+1:]]
        # matching row with feature j "off": feature j is masked as well
        X_masked[-(i+1),:] = x
        X_masked[-(i+1),inds[pos:]] = X[rind,inds[pos:]]
    evals = f(X_masked)
    return np.mean(evals[:nsamples//2] - evals[nsamples//2:])
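A quick spot check of the sampling estimator against the wrapper f defined above (this assumes Xm = X.as_matrix() as set in a later cell); with such a small nsamples the estimate is quite noisy:
In [ ]:
# Shapley sampling estimate of feature 0's attribution for the first row
ime(0, f, Xm[0,:], Xm, nsamples=100)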
In [8]:
import logging
from iml.common import convert_to_instance, convert_to_model, match_instance_to_data, match_model_to_data, convert_to_instance_with_index
from iml.explanations import AdditiveExplanation
from iml.links import convert_to_link, IdentityLink
from iml.datatypes import convert_to_data, DenseData
from tqdm import tqdm  # used by the standalone IMEExplainer below
from shap import KernelExplainer
log = logging.getLogger('shap')
In [315]:
class IMEExplainer(KernelExplainer):
    """ This is an implementation of the IME explanation method (aka. Shapley sampling values).
    It is implemented here for comparison and evaluation purposes; the KernelExplainer is
    typically more efficient and so is the preferred model agnostic estimation method in this package.
    IME was proposed in "An Efficient Explanation of Individual Classifications using Game Theory",
    Erik Štrumbelj, Igor Kononenko, JMLR 2010
    """

    def __init__(self, model, data, **kwargs):
        # silence warning about large datasets
        level = log.level
        log.setLevel(logging.ERROR)
        super(IMEExplainer, self).__init__(model, data, **kwargs)
        log.setLevel(level)

    def explain(self, incoming_instance, **kwargs):
        # convert incoming input to a standardized iml object
        instance = convert_to_instance(incoming_instance)
        match_instance_to_data(instance, self.data)

        # pick a reasonable number of samples if the user didn't specify how many they wanted
        self.nsamples = kwargs.get("nsamples", 0)
        if self.nsamples == 0:
            self.nsamples = 1000 * self.P

        # divide up the samples among the features
        self.nsamples_each = np.ones(self.P, dtype=np.int64) * 2 * (self.nsamples // (self.P * 2))
        for i in range((self.nsamples % (self.P * 2)) // 2):
            self.nsamples_each[i] += 2

        model_out = self.model.f(instance.x)

        # explain every feature
        phi = np.zeros(self.P)
        self.X_masked = np.zeros((self.nsamples_each.max(), self.data.data.shape[1]))
        for i in range(self.P):
            phi[i] = self.ime(i, self.model.f, instance.x, self.data.data, nsamples=self.nsamples_each[i])
        phi = np.array(phi)

        return AdditiveExplanation(self.link.f(1), self.link.f(1), phi, np.zeros(len(phi)), instance, self.link,
                                   self.model, self.data)

    def ime(self, j, f, x, X, nsamples=10):
        assert nsamples % 2 == 0, "nsamples must be divisible by 2!"
        X_masked = self.X_masked[:nsamples,:]
        inds = np.arange(X.shape[1])
        for i in range(0, nsamples//2):
            np.random.shuffle(inds)
            pos = np.where(inds == j)[0][0]
            rind = np.random.randint(X.shape[0])
            X_masked[i,:] = x
            X_masked[i,inds[pos+1:]] = X[rind,inds[pos+1:]]
            X_masked[-(i+1),:] = x
            X_masked[-(i+1),inds[pos:]] = X[rind,inds[pos:]]
        evals = f(X_masked)
        return np.mean(evals[:nsamples//2] - evals[nsamples//2:])
In [412]:
class SampleExplainer(KernelExplainer):
    """ A sampling-based weighted least squares estimate of the Shapley values (for comparison).
    """

    def __init__(self, model, data, **kwargs):
        # silence warning about large datasets
        level = log.level
        log.setLevel(logging.ERROR)
        super(SampleExplainer, self).__init__(model, data, **kwargs)
        log.setLevel(level)

        self.model_null = np.mean(self.model.f(self.data.data))

    def explain(self, incoming_instance, **kwargs):
        # convert incoming input to a standardized iml object
        instance = convert_to_instance(incoming_instance)
        match_instance_to_data(instance, self.data)

        # pick a reasonable number of samples if the user didn't specify how many they wanted
        self.nsamples = kwargs.get("nsamples", 0)
        if self.nsamples == 0:
            self.nsamples = 1000 * self.P

        model_out = self.model.f(instance.x)

        # explain every instance
        phi = np.zeros(self.P)
        self.X_masked = np.zeros((self.nsamples, self.data.data.shape[1]))
        self.X_mask = np.zeros((self.nsamples+1, self.data.data.shape[1]))
        size_weights = np.array([(self.P - 1)/(s * (self.P - s)) for s in range(1,self.P)])
        size_weights /= size_weights.sum()
        for i in range(self.nsamples):
            # choose the size of the subset
            s = np.random.choice(len(size_weights), 1, p=size_weights) + 1
            # choose the subset
            S = np.random.choice(self.P, s, replace=False)
            rind = np.random.randint(self.data.data.shape[0])
            self.X_masked[i,:] = self.data.data[rind,:]
            self.X_masked[i,S] = instance.x[0,S]
            self.X_mask[i,S] = 1
        y = np.zeros(self.nsamples+1)
        y[:-1] = self.model.f(self.X_masked)
        self.X_mask[-1,:] = 1
        y[-1] = self.model.f(instance.x)[0]
        y -= self.model_null

        # solve a regularized weighted least squares problem for the attributions
        weights = np.ones(self.nsamples + 1) / (self.nsamples * 100)
        weights[-1] = 0.99
        tmp = (self.X_mask.T * weights)
        phi = np.linalg.inv(tmp @ self.X_mask + np.eye(self.P) * 0.01 / self.nsamples) @ tmp @ y

        return AdditiveExplanation(self.link.f(1), self.link.f(1), phi, np.zeros(len(phi)), instance, self.link,
                                   self.model, self.data)
In [406]:
SampleExplainer(f, np.zeros((1,X.shape[1]))).shap_values(Xm[0,:], nsamples = 1000)
Out[406]:
In [ ]:
correct = KernelExplainer(f, np.zeros((1,X.shape[1]))).shap_values(Xm[0,:], nsamples = 100000)
In [ ]:
for i in range(10):
    phi1 = SampleExplainer(f, np.zeros((1,X.shape[1]))).shap_values(Xm[0,:], nsamples = 1000)
    phi2 = IMEExplainer(f, np.zeros((1,X.shape[1]))).shap_values(Xm[0,:], nsamples = 1000//2)
    print(np.abs(correct[:-1] - phi1[:-1]).sum())
    print(np.abs(correct[:-1] - phi2[:-1]).sum())
    print()
In [428]:
[IMEExplainer(f, np.zeros((1,X.shape[1]))).shap_values(Xm[0,:], nsamples = 1000)]
Out[428]:
In [427]:
IMEExplainer(f, Xm).shap_values(Xm[0,:], nsamples = 100)
Out[427]:
In [425]:
shap.TreeExplainer(model).shap_values(Xm[0:1,:])
Out[425]:
In [421]:
shap.KernelExplainer(f, Xm).shap_values(Xm[0:1,:])
Out[421]:
In [419]:
shap_values[0,:]
Out[419]:
In [407]:
SampleExplainer(f, np.zeros((1,X.shape[1]))).shap_values(Xm[0,:], nsamples = 1000)
Out[407]:
In [417]:
dist_ime = np.array([SampleExplainer(f, np.zeros((1,X.shape[1]))).shap_values(Xm[0,:], nsamples = 1000)[0] for i in range(100)])
dist_ime.std()
Out[417]:
In [416]:
dist_ime = np.array([IMEExplainer(f, np.zeros((1,X.shape[1]))).shap_values(Xm[0,:], nsamples = 1000)[0] for i in range(100)])
dist_ime.std()
Out[416]:
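These repeated runs compare the sampling variance of the two estimators at a fixed evaluation budget: each cell recomputes the first SHAP value 100 times and reports its standard deviation, so a lower value at the same nsamples means a more efficient estimator.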
In [ ]:
[(M - 1)/(s * (M - s)) for s in range(1, M)]  # s = 0 and s = M give infinite weight, so skip them
In [314]:
M = X.shape[1]
size_weights = np.array([(M - 1)/(s * (M - s)) for s in range(1,M)])
size_weights /= size_weights.sum()
for i in range(10):
    # choose the size of the subset
    s = np.random.choice(len(size_weights), 1, p=size_weights) + 1
    # choose the subset
    S = np.random.choice(M, s, replace=False)
    rind = np.random.randint(X.shape[0])
    print(S)
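These subset-size weights are proportional to the Shapley kernel weight (M-1)/(s*(M-s)) for a subset of size s, which places most of the mass on very small and very large subsets; the s = 0 and s = M cases have infinite weight and are handled separately (via the model_null offset and the all-ones constraint row in SampleExplainer above).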
In [312]:
len(size_weights), M
Out[312]:
In [303]:
#M = 3
(M - 1)/(M-2)
Out[303]:
In [276]:
dist_ime = np.array([IMEExplainer(f, np.zeros((1,X.shape[1]))).shap_values(Xm[0,:], nsamples = 100)[0] for i in range(100)])
dist_ime.std()
Out[276]:
In [415]:
dist_ime = np.array([IMEExplainer(f, Xm).shap_values(Xm[0,:], nsamples = 1000)[0] for i in range(100)])
dist_ime.std()
Out[415]:
In [414]:
dist_ime = np.array([SampleExplainer(f, Xm).shap_values(Xm[0,:], nsamples = 1000)[0] for i in range(100)])
dist_ime.std()
Out[414]:
In [269]:
inds = np.arange(X.shape[0])
n_iter = 100
dist_kernel = np.zeros(n_iter)
for i in range(n_iter):
    np.random.shuffle(inds)
    dist_kernel[i] = KernelExplainer(f, Xm[inds[10:20],:]).shap_values(Xm[0,:], nsamples = 1000)[0]
dist_kernel.std()
Out[269]:
In [274]:
inds = np.arange(X.shape[0])
n_iter = 100
dist_kernel = np.zeros(n_iter)
for i in range(n_iter):
    #np.random.shuffle(inds)
    dist_kernel[i] = KernelExplainer(f, np.zeros((1,X.shape[1]))).shap_values(Xm[0,:], nsamples = 100)[0]
dist_kernel.std()
Out[274]:
In [275]:
inds = np.arange(X.shape[0])
n_iter = 100
dist_kernel = np.zeros(n_iter)
for i in range(n_iter):
    np.random.shuffle(inds)
    for j in range(10):
        dist_kernel[i] += KernelExplainer(f, Xm[inds[0]:inds[0]+1,:]).shap_values(Xm[0,:], nsamples = 100)[0]
    dist_kernel[i] /= 10  # average the 10 repeats
dist_kernel.std()
Out[275]:
In [282]:
inds = np.arange(X.shape[0])
n_iter = 100
dist_kernel = np.zeros(n_iter)
for i in range(n_iter):
    #np.random.shuffle(inds)
    for j in range(10):
        dist_kernel[i] += IMEExplainer(f, Xm[inds[0]:inds[0]+1,:]).shap_values(Xm[0,:], nsamples = 100)[0]
    dist_kernel[i] /= 10  # average the 10 repeats
dist_kernel.std()
Out[282]:
In [ ]:
dist_ime = np.array([IMEExplainer(f, np.zeros((1,X.shape[1]))).shap_values(Xm[0,:], nsamples = 100)[0] for i in range(100)])
dist_ime.std()
In [150]:
pl.hist(dist_ime)
pl.show()
In [153]:
y = f(X.as_matrix())
np.linalg.inv(X.as_matrix().T @ X.as_matrix()) @ X.as_matrix().T @ (y - y.mean())
Out[153]:
In [158]:
y = f(X.as_matrix())
Xm = X.as_matrix()
np.linalg.inv(Xm.T @ Xm) @ Xm.T @ (y - 10)
Out[158]:
In [159]:
np.linalg.inv(Xm.T @ Xm) @ Xm.T @ np.ones(len(y))
Out[159]:
In [161]:
Xm.mean(0)
Out[161]:
In [120]:
X.iloc[0:1,:]
Out[120]:
In [169]:
shap_values[3,:]
Out[169]:
In [173]:
y = f(X.as_matrix())
In [175]:
pl.plot(Xm[:,0], y, ".")
Out[175]:
In [177]:
for i in range(X.shape[1]):
    pl.plot(Xm[:,i], y, ".")
    pl.show()
In [110]:
shap.dependence_plot(0, shap_values, X, interaction_index=1)
In [163]:
shap.summary_plot(shap_values, X)
In [102]:
pl.hist(X.iloc[:,0])
pl.show()
In [90]:
for i in range(X.shape[1]):
    pl.hist(X.iloc[:,i])
    pl.show()
In [18]:
shap.kmeans(X, 10)
Out[18]:
In [47]:
shap.KernelExplainer(f, X).shap_values(X.iloc[0:1,:], nsamples=100000)
Out[47]:
In [128]:
shap.KernelExplainer(f, shap.kmeans(X, 5)).shap_values(X.iloc[0:1,:], nsamples=10040)
Out[128]:
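Summarizing the background set with k-means (each mean weighted by the size of its cluster) keeps the expectation estimates reasonable while cutting the number of model evaluations far below what using all of X as the background would require.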
In [129]:
inds = np.arange(X.shape[0])
np.random.shuffle(inds)
In [147]:
inds = np.arange(X.shape[0])
phi = np.zeros(X.shape[1]+1)
for i in range(1):
    np.random.shuffle(inds)
    phi += shap.KernelExplainer(f, X.iloc[inds[:1000],:]).shap_values(X.iloc[0,:], nsamples=10000)
phi / 1
Out[147]:
In [146]:
inds = np.arange(X.shape[0])
phi = np.zeros(X.shape[1]+1)
for i in range(1000):
    np.random.shuffle(inds)
    phi += shap.KernelExplainer(f, X.iloc[inds[:1],:]).shap_values(X.iloc[0,:], nsamples=10000)
phi / 1000
Out[146]:
In [32]:
shap.KernelExplainer(f, X.iloc[inds[:1000],:]).shap_values(X.iloc[0:1,:], nsamples=500)
Out[32]:
In [95]:
[ime(j, f, X.as_matrix()[0,:], X.as_matrix(), nsamples=1000) for j in range(X.shape[1])]
Out[95]:
In [21]:
shap_values[0,:]
Out[21]:
In [114]:
shap.summary_plot(shap_values, X, show=False)
pl.savefig("data/nhanes_summary.pdf", dpi=400)
pl.show()
In [7]:
# shap.summary_plot(shap_values, X, show=False)
# pl.gca().set_rasterized(True)
# pl.savefig("data/nhanes_summary.pdf", dpi=400)
In [81]:
shap.dependence_plot("BMI", shap_values, X, show=False, interaction_index="BMI")
pl.xlim(15,50)
pl.gcf().set_size_inches(5.5, 5)
#pl.savefig("data/nhanes_bmi.pdf")
pl.show()
In [102]:
shap.dependence_plot("Systolic BP", shap_values, X, show=False)
pl.xlim(80,225)
pl.ylim(-0.4,0.8)
pl.savefig("data/nhanes_sbp.pdf", dpi=400)
pl.show()
In [7]:
shap.dependence_plot("Pulse pressure", shap_values, X, show=False)
#pl.xlim(80,225)
# pl.savefig("data/nhanes_sbp.pdf")
# pl.show()
In [91]:
start = time.time()
shap_interaction_values = model.predict(xgboost.DMatrix(X.iloc[:,:]), pred_interactions=True)
time.time() - start
Out[91]:
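pred_interactions=True returns a tensor of shape (n_samples, n_features + 1, n_features + 1), again including a bias row and column, and summing the rows should recover the per-feature SHAP values up to floating point error:
In [ ]:
# the interaction tensor's row sums should match the SHAP values computed earlier
shap_interaction_values.shape, np.abs(shap_interaction_values.sum(axis=2) - shap_values).max()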
In [131]:
class IMEExplainer:
    """ This is an implementation of the IME explanation method (aka. Shapley sampling values).
    It is implemented here for comparison and evaluation purposes; the KernelExplainer is
    typically more efficient and so is the preferred model agnostic estimation method in this package.
    IME was proposed in "An Efficient Explanation of Individual Classifications using Game Theory",
    Erik Štrumbelj, Igor Kononenko, JMLR 2010
    """

    def __init__(self, model, data, **kwargs):
        self.model = convert_to_model(model)
        self.keep_index = kwargs.get("keep_index", False)
        self.data = convert_to_data(data, keep_index=self.keep_index)
        match_model_to_data(self.model, self.data)

        # enforce our current input type limitations
        assert isinstance(self.data, DenseData), "Shap explainer only supports the DenseData input currently."
        assert not self.data.transposed, "Shap explainer does not support transposed DenseData currently."

        # init our parameters
        self.N = self.data.data.shape[0]
        self.P = self.data.data.shape[1]
        self.nsamplesAdded = 0
        self.nsamplesRun = 0

    def shap_values(self, X, **kwargs):
        # note: this early draft relies on an explain() method like the one in the
        # KernelExplainer-based version of this class defined above

        # convert dataframes
        if str(type(X)).endswith("pandas.core.series.Series'>"):
            X = X.as_matrix()
        elif str(type(X)).endswith("'pandas.core.frame.DataFrame'>"):
            if self.keep_index:
                index_value = X.index.values
                index_name = X.index.name
                column_name = list(X.columns)
            X = X.as_matrix()

        assert str(type(X)).endswith("'numpy.ndarray'>"), "Unknown instance type: " + str(type(X))
        assert len(X.shape) == 1 or len(X.shape) == 2, "Instance must have 1 or 2 dimensions!"

        # pick a reasonable number of samples if the user didn't specify how many they wanted
        nsamples = kwargs.get("nsamples", 0)
        if nsamples == 0:
            nsamples = 1000 * self.P

        # divide up the samples among the features
        nsamples_each = np.ones(self.P) * 2 * (nsamples // (self.P * 2))
        for i in range((nsamples % (self.P * 2)) // 2):
            nsamples_each[i] += 2

        # single instance
        if len(X.shape) == 1:
            data = X.reshape((1, X.shape[0]))
            if self.keep_index:
                data = convert_to_instance_with_index(data, column_name, index_name, index_value)
            explanation = self.explain(data, **kwargs)

            # vector-output
            s = explanation.effects.shape
            if len(s) == 2:
                outs = [np.zeros(s[0] + 1) for j in range(s[1])]
                for j in range(s[1]):
                    outs[j][:-1] = explanation.effects[:, j]
                    outs[j][-1] = explanation.base_value[j]
                return outs

            # single-output
            else:
                out = np.zeros(s[0] + 1)
                out[:-1] = explanation.effects
                out[-1] = explanation.base_value
                return out

        # explain the whole dataset
        elif len(X.shape) == 2:
            explanations = []
            for i in tqdm(range(X.shape[0])):
                data = X[i:i + 1, :]
                if self.keep_index:
                    data = convert_to_instance_with_index(data, column_name, index_value[i:i + 1], index_name)
                explanations.append(self.explain(data, **kwargs))

            # vector-output
            s = explanations[0].effects.shape
            if len(s) == 2:
                outs = [np.zeros((X.shape[0], s[0] + 1)) for j in range(s[1])]
                for i in range(X.shape[0]):
                    for j in range(s[1]):
                        outs[j][i, :-1] = explanations[i].effects[:, j]
                        outs[j][i, -1] = explanations[i].base_value[j]
                return outs

            # single-output
            else:
                out = np.zeros((X.shape[0], s[0] + 1))
                for i in range(X.shape[0]):
                    out[i, :-1] = explanations[i].effects
                    out[i, -1] = explanations[i].base_value
                return out

# standalone Shapley sampling estimate for a single feature j
def ime(j, f, x, X, nsamples=10):
    assert nsamples % 2 == 0, "nsamples must be divisible by 2!"
    X_masked = np.zeros((nsamples, X.shape[1]))
    inds = np.arange(X.shape[1])
    for i in range(0, nsamples//2):
        np.random.shuffle(inds)  # draw a fresh random feature ordering for each sample pair
        pos = np.where(inds == j)[0][0]
        rind = np.random.randint(X.shape[0])
        X_masked[i,:] = x
        X_masked[i,inds[pos+1:]] = X[rind,inds[pos+1:]]
        X_masked[-(i+1),:] = x
        X_masked[-(i+1),inds[pos:]] = X[rind,inds[pos:]]
    evals = f(X_masked)
    return np.mean(evals[:nsamples//2] - evals[nsamples//2:])
In [56]:
np.exp(pred[i]),np.exp(pred[i:]).sum()
Out[56]:
In [57]:
pred
Out[57]:
In [62]:
?np.sort
In [75]:
pred = model.predict(xgb_full, output_margin=True)
pred = np.flip(np.sort(pred),axis=0)
C = 0.001
tmp = [np.log(np.exp(pred[i]+C)/(np.exp(pred[i+1:]).sum() + np.exp(pred[i]+C))) - np.log(np.exp(pred[i])/np.exp(pred[i:]).sum()) for i in range(1000)]
pl.plot(tmp)
Out[75]:
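This probes the curvature of the Cox partial likelihood: for each of the 1000 highest-risk subjects (predictions sorted in decreasing order) it measures how much subject i's log-share of its risk set changes when that subject's own margin is increased by C, so the plot shows how the effect of a fixed margin bump depends on how much a subject already dominates its risk set.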
In [ ]:
def scaled(A, B, C):
    return A*C/(B + A*C)
In [41]:
B = 100
C = 0.01
out = np.zeros(100)
proportion = np.zeros(100)
As = np.linspace(-20,20,100)
for i,A in enumerate(As):
    proportion[i] = np.exp(A)/(B+np.exp(A))
    out[i] = A + C - np.log(B + np.exp(A)*np.exp(C)) - (A - np.log(B + np.exp(A)))
In [42]:
pl.plot(proportion,out)
pl.title("Asdf")
Out[42]:
In [43]:
xs = np.linspace(-10,10,100)
pl.plot(xs, np.log(1/(1+np.exp(-xs+C))) - np.log(1/(1+np.exp(-xs))))
Out[43]:
In [ ]:
np.log(1) - np.log(1+np.exp(-xs+C))
In [104]:
shap.dependence_plot(
("Systolic BP", "Systolic BP"),
shap_interaction_values, X.iloc[:,:],
display_features=X_display.iloc[:,:],
show=False
)
pl.xlim(80,225)
pl.ylim(-0.4,0.8)
pl.ylabel("SHAP main effect value for\nSystolic BP")
pl.gcf().set_size_inches(6, 5)
pl.savefig("data/nhanes_sbp_main_effect.pdf", dpi=400)
pl.show()
In [111]:
def dependence_plot2(ind, shap_values, features, feature_names=None, display_features=None,
                     interaction_index="auto", color="#ff0052", axis_color="#333333",
                     dot_size=16, alpha=1, title=None, show=True):
    """
    Create a SHAP dependence plot, colored by an interaction feature.

    Parameters
    ----------
    ind : int
        Index of the feature to plot.

    shap_values : numpy.array
        Matrix of SHAP values (# samples x # features)

    features : numpy.array or pandas.DataFrame
        Matrix of feature values (# samples x # features)

    feature_names : list
        Names of the features (length # features)

    display_features : numpy.array or pandas.DataFrame
        Matrix of feature values for visual display (such as strings instead of coded values)

    interaction_index : "auto", None, or int
        The index of the feature used to color the plot.
    """
    import matplotlib
    # helpers assumed to live in shap's plotting module in this version of the package
    from shap.plots import approx_interactions, red_blue

    # convert from DataFrames if we got any
    if str(type(features)) == "<class 'pandas.core.frame.DataFrame'>":
        if feature_names is None:
            feature_names = features.columns
        features = features.as_matrix()
    if str(type(display_features)) == "<class 'pandas.core.frame.DataFrame'>":
        if feature_names is None:
            feature_names = display_features.columns
        display_features = display_features.as_matrix()
    elif display_features is None:
        display_features = features

    if feature_names is None:
        feature_names = ["Feature "+str(i) for i in range(shap_values.shape[1]-1)]

    # allow vectors to be passed
    if len(shap_values.shape) == 1:
        shap_values = np.reshape(shap_values, (len(shap_values), 1))
    if len(features.shape) == 1:
        features = np.reshape(features, (len(features), 1))

    def convert_name(ind):
        if type(ind) == str:
            nzinds = np.where(feature_names == ind)[0]
            if len(nzinds) == 0:
                print("Could not find feature named: "+ind)
                return None
            else:
                return nzinds[0]
        else:
            return ind
    ind = convert_name(ind)

    # plotting SHAP interaction values
    if len(shap_values.shape) == 3 and len(ind) == 2:
        ind1 = convert_name(ind[0])
        ind2 = convert_name(ind[1])
        if ind1 == ind2:
            proj_shap_values = shap_values[:,ind2,:]
        else:
            proj_shap_values = shap_values[:,ind2,:] * 2  # off-diag values are split in half

        dependence_plot2(
            ind1, proj_shap_values, features, feature_names=feature_names,
            interaction_index=ind2, display_features=display_features, show=False
        )
        if ind1 == ind2:
            pl.ylabel("SHAP main effect value for\n"+feature_names[ind1])
        else:
            pl.ylabel("SHAP interaction value for\n"+feature_names[ind1]+" and "+feature_names[ind2])
        if show:
            pl.show()
        return

    # get both the raw and display feature values
    xv = features[:,ind]
    xd = display_features[:,ind]
    s = shap_values[:,ind]
    if type(xd[0]) == str:
        name_map = {}
        for i in range(len(xv)):
            name_map[xd[i]] = xv[i]
        xnames = list(name_map.keys())

    # allow a single feature name to be passed alone
    if type(feature_names) == str:
        feature_names = [feature_names]
    name = feature_names[ind]

    # guess which other feature has the strongest interaction with the plotted feature
    if interaction_index == "auto":
        interaction_index = approx_interactions(ind, shap_values, features)[0]
    interaction_index = convert_name(interaction_index)

    # get both the raw and display color values
    cv = features[:,interaction_index]
    cd = display_features[:,interaction_index]
    categorical_interaction = False
    clow = np.nanpercentile(features[:,interaction_index], 5)
    chigh = np.nanpercentile(features[:,interaction_index], 95)
    if type(cd[0]) == str:
        cname_map = {}
        for i in range(len(cv)):
            cname_map[cd[i]] = cv[i]
        cnames = list(cname_map.keys())
        categorical_interaction = True
    elif clow % 1 == 0 and chigh % 1 == 0 and len(set(features[:,interaction_index])) < 50:
        categorical_interaction = True

    # discretize colors for categorical features
    color_norm = None
    if categorical_interaction and clow != chigh:
        bounds = np.linspace(clow, chigh, int(chigh-clow+2))
        color_norm = matplotlib.colors.BoundaryNorm(bounds, red_blue.N)

    # the actual scatter plot, TODO: adapt the dot_size to the number of data points?
    pl.scatter(xv, s, s=dot_size, linewidth=0, c="#1E88E5",
               alpha=alpha, rasterized=len(xv) > 500)

    if interaction_index != ind:
        # draw the color bar
        norm = None
        if type(cd[0]) == str:
            tick_positions = [cname_map[n] for n in cnames]
            if len(tick_positions) == 2:
                tick_positions[0] -= 0.25
                tick_positions[1] += 0.25
            cb = pl.colorbar(ticks=tick_positions)
            cb.set_ticklabels(cnames)
        else:
            cb = pl.colorbar()

        cb.set_label(feature_names[interaction_index], size=13)
        cb.ax.tick_params(labelsize=11)
        if categorical_interaction:
            cb.ax.tick_params(length=0)
        cb.set_alpha(1)
        cb.outline.set_visible(False)
        bbox = cb.ax.get_window_extent().transformed(pl.gcf().dpi_scale_trans.inverted())
        cb.ax.set_aspect((bbox.height-0.7)*20)

    # make the plot more readable
    if interaction_index != ind:
        pl.gcf().set_size_inches(7.5, 5)
    else:
        pl.gcf().set_size_inches(6, 5)
    pl.xlabel(name, color=axis_color, fontsize=13)
    pl.ylabel("SHAP value for\n"+name, color=axis_color, fontsize=13)
    if title is not None:
        pl.title(title, color=axis_color, fontsize=13)
    pl.gca().xaxis.set_ticks_position('bottom')
    pl.gca().yaxis.set_ticks_position('left')
    pl.gca().spines['right'].set_visible(False)
    pl.gca().spines['top'].set_visible(False)
    pl.gca().tick_params(color=axis_color, labelcolor=axis_color, labelsize=11)
    for spine in pl.gca().spines.values():
        spine.set_edgecolor(axis_color)
    if type(xd[0]) == str:
        pl.xticks([name_map[n] for n in xnames], xnames, rotation='vertical', fontsize=11)
    if show:
        pl.show()
In [112]:
dependence_plot2(
("Systolic BP", "Systolic BP"),
shap_interaction_values, X.iloc[:,:],
display_features=X_display.iloc[:,:],
show=False
)
pl.xlim(80,225)
pl.ylim(-0.4,0.8)
pl.ylabel("SHAP main effect value for\nSystolic BP")
pl.gcf().set_size_inches(6, 5)
pl.savefig("data/nhanes_sbp_main_effect.pdf", dpi=400)
pl.show()
In [100]:
shap.dependence_plot(
("Systolic BP", "Age"),
shap_interaction_values, X.iloc[:,:],
display_features=X_display.iloc[:,:],
show=False
)
pl.xlim(80,225)
pl.ylim(-0.4,0.8)
pl.savefig("data/nhanes_sbp_age_interaction.pdf", dpi=400)
pl.show()
In [5]:
shap.dependence_plot(
("Age", "Sex"),
shap_interaction_values, X.iloc[:1000,:],
display_features=X_display.iloc[:1000,:]
)
In [56]:
shap.dependence_plot(0, shap_interaction_values[:,4,:], X.iloc[:1000,:], interaction_index=4, show=False)
pl.ylabel("SHAP interaction value for\nAge and Serum Cholesterol")
pl.show()
In [58]:
shap.dependence_plot(4, shap_interaction_values[:,0,:], X.iloc[:1000,:], interaction_index=0, show=False)
pl.ylabel("SHAP interaction value for\nAge and Serum Cholesterol")
pl.show()
In [37]:
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
shap_pca2 = PCA(n_components=2).fit_transform(shap_values[:,:-1])
In [13]:
f = pl.figure(figsize=(5,5))
pl.scatter(shap_pca2[:,0], shap_pca2[:,1], c=np.sum(shap_values,axis=1), linewidth=0, alpha=0.5, cmap=shap.plots.red_blue)
cb = pl.colorbar(label="Model output", aspect=40, orientation="horizontal")
cb.set_alpha(1)
cb.draw_all()
cb.outline.set_linewidth(0)
cb.ax.tick_params('x', length=0)
cb.ax.xaxis.set_label_position('top')
pl.gca().axis("off")
pl.show()
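Here the 2D PCA embedding of the SHAP value matrix acts as a map of the population: each point is an individual, and recoloring the same embedding by model output (above) and by Age, Sex, and Systolic BP (below) shows which features the main explanation directions track.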
In [23]:
f = pl.figure(figsize=(5,5))
pl.scatter(shap_pca2[:,0], shap_pca2[:,1], c=X["Age"], linewidth=0, alpha=0.5, cmap=shap.plots.red_blue)
cb = pl.colorbar(label="Model output", aspect=40, orientation="horizontal")
cb.set_alpha(1)
cb.draw_all()
cb.outline.set_linewidth(0)
cb.ax.tick_params('x', length=0)
cb.ax.xaxis.set_label_position('top')
pl.gca().axis("off")
pl.show()
In [24]:
f = pl.figure(figsize=(5,5))
pl.scatter(shap_pca2[:,0], shap_pca2[:,1], c=X["Sex"], linewidth=0, alpha=0.5, cmap=shap.plots.red_blue)
cb = pl.colorbar(label="Model output", aspect=40, orientation="horizontal")
cb.set_alpha(1)
cb.draw_all()
cb.outline.set_linewidth(0)
cb.ax.tick_params('x', length=0)
cb.ax.xaxis.set_label_position('top')
pl.gca().axis("off")
pl.show()
In [15]:
shap_pca2 = PCA(n_components=2).fit(shap_values[:,:-1])
In [17]:
?shap_pca2
In [19]:
shap_pca2.components_.round(2)
Out[19]:
In [39]:
shap_embedded = TSNE(n_components=2, perplexity=50).fit_transform(shap_values[:1000,:-1])
In [32]:
f = pl.figure(figsize=(5,5))
pl.scatter(shap_embedded[:,0], shap_embedded[:,1], c=np.sum(shap_values[:1000,:],axis=1), linewidth=0, alpha=0.5, cmap=shap.plots.red_blue)
cb = pl.colorbar(label="Model output", aspect=40, orientation="horizontal")
cb.set_alpha(1)
cb.draw_all()
cb.outline.set_linewidth(0)
cb.ax.tick_params('x', length=0)
cb.ax.xaxis.set_label_position('top')
pl.gca().axis("off")
pl.show()
In [34]:
f = pl.figure(figsize=(5,5))
pl.scatter(shap_embedded[:,0], shap_embedded[:,1], c=X["Age"][:1000], linewidth=0, alpha=0.5, cmap=shap.plots.red_blue)
cb = pl.colorbar(label="Model output", aspect=40, orientation="horizontal")
cb.set_alpha(1)
cb.draw_all()
cb.outline.set_linewidth(0)
cb.ax.tick_params('x', length=0)
cb.ax.xaxis.set_label_position('top')
pl.gca().axis("off")
pl.show()
In [35]:
f = pl.figure(figsize=(5,5))
pl.scatter(shap_embedded[:,0], shap_embedded[:,1], c=X["Sex"][:500], linewidth=0, alpha=0.5, cmap=shap.plots.red_blue)
cb = pl.colorbar(label="Model output", aspect=40, orientation="horizontal")
cb.set_alpha(1)
cb.draw_all()
cb.outline.set_linewidth(0)
cb.ax.tick_params('x', length=0)
cb.ax.xaxis.set_label_position('top')
pl.gca().axis("off")
pl.show()
In [36]:
f = pl.figure(figsize=(5,5))
pl.scatter(shap_embedded[:,0], shap_embedded[:,1], c=X["Systolic BP"][:500], linewidth=0, alpha=0.5, cmap=shap.plots.red_blue)
cb = pl.colorbar(label="Model output", aspect=40, orientation="horizontal")
cb.set_alpha(1)
cb.draw_all()
cb.outline.set_linewidth(0)
cb.ax.tick_params('x', length=0)
cb.ax.xaxis.set_label_position('top')
pl.gca().axis("off")
pl.show()
In [38]:
f = pl.figure(figsize=(5,5))
pl.scatter(shap_pca2[:,0], shap_pca2[:,1], c=X["Systolic BP"], linewidth=0, alpha=0.5, cmap=shap.plots.red_blue)
cb = pl.colorbar(label="Model output", aspect=40, orientation="horizontal")
cb.set_alpha(1)
cb.draw_all()
cb.outline.set_linewidth(0)
cb.ax.tick_params('x', length=0)
cb.ax.xaxis.set_label_position('top')
pl.gca().axis("off")
pl.show()